Genome-Wide Chromatin Accessibility Matrix
Sys.time()[1] "2022-09-22 23:59:58 CDT"
[1] "America/Chicago"
Preparation
PROJECT_DIR <- file.path(
"/Users/jialei/Dropbox/Data/Projects/UTSW",
"/Cardiomyopathy/atac-seq"
)Functions
Load required packages.
library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.3.2.9000 ──
## ✔ ggplot2 3.3.6.9000 ✔ dplyr 1.0.99.9000
## ✔ tibble 3.1.8.9001 ✔ stringr 1.4.1.9000
## ✔ tidyr 1.2.1.9000 ✔ forcats 0.5.2.9000
## ✔ readr 2.1.2.9000 ✔ lubridate 1.8.0.9000
## ✔ purrr 0.9000.0.9000
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(Matrix)
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(patchwork)
library(extrafont)
## Registering fonts with R`%+replace%` <- ggplot2::`%+replace%`Python env
numpy version: 1.22.4
reticulate::py_config()python: /Users/jialei/.pyenv/shims/python
libpython: /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/libpython3.9.dylib
pythonhome: /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10:/Users/jialei/.pyenv/versions/mambaforge-4.10.3-10
version: 3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:00:33) [Clang 13.0.1 ]
numpy: /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy
numpy_version: 1.22.4
numpy: /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy
NOTE: Python version was forced by RETICULATE_PYTHON
Preprocessing
Matrix
# load matrix
feature_counts_files <- list.files(
path = file.path(PROJECT_DIR, "defined_peaks/summarize_counts")
)
feature_counts_files <- feature_counts_files[
stringr::str_detect(feature_counts_files, pattern = "merged.+.bz2")
]
matrix_readcount_use <- purrr::map(feature_counts_files, \(x) {
dt <- data.table::fread(
file = file.path(PROJECT_DIR, "defined_peaks/summarize_counts", x)
)
features <- dt[["Geneid"]]
dt <- dt[, .SD, .SDcols = patterns("/project")]
colnames(dt) <- stringr::str_remove(
string = colnames(dt),
pattern = "_atacseq/alignments/Aligned_sorted_deduped_q10.bam"
) |>
stringr::str_remove(pattern = "^.+/")
m <- as.matrix(dt, sparse = TRUE)
rownames(m) <- features
return(m)
})
matrix_readcount_use <- purrr::reduce(matrix_readcount_use, cbind)
dim(matrix_readcount_use)[1] 207021 87
# rename samples
sample_ids <- tibble::tribble(
~old, ~new,
"heart_fresh1_1", "F1_1",
"heart_fresh1_2", "F1_2",
"heart_fresh2_1", "F2_1",
"heart_fresh2_2", "F2_2",
"heart_fresh5_1", "F5_1",
"heart_fresh5_2", "F5_2",
"heart_myectomy_p11_1", "HOCM11_1",
"heart_myectomy_p11_2", "HOCM11_2",
"heart_myectomy_p7_1", "HOCM7_1",
"heart_myectomy_p7_2", "HOCM7_2",
#
"heart_myectomy_p4_2", "MYEC4_2"
)
colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) |>
tibble::enframe(value = "sample") |>
dplyr::left_join(sample_ids, by = c("sample" = "old")) |>
dplyr::mutate(
new = case_when(
is.na(new) ~ sample,
TRUE ~ new
),
new = stringr::str_remove(string = new, pattern = "heart_"),
#
new = case_when(
stringr::str_detect(
string = new,
pattern = "[hocm|Hocm|MYEC]"
) ~ stringr::str_to_upper(new),
TRUE ~ stringr::str_to_title(new)
)
) |>
dplyr::pull(new)# inspect matrix
matrix_readcount_use[1:5, 1:8] F1_1 F1_2 F2_1 F2_2 F5_1 F5_2 P3_1 P3_2
1_181358_181567 4 3 5 14 10 11 8 1
1_183716_183885 4 4 6 9 2 7 3 5
1_183999_184321 3 3 9 15 2 11 5 10
1_191239_191880 19 17 24 28 14 28 9 10
1_267894_268128 5 2 10 21 8 14 2 8
# filter blocked regions
blocked_regions <- read.table(
file = file.path(
dirname(PROJECT_DIR),
"misc/annotations",
"ENCFF419RSJ.bed.gz"
),
stringsAsFactors = FALSE
)
blocked_regions <- GenomicRanges::GRanges(
blocked_regions[, 1],
IRanges::IRanges(blocked_regions[, 2] + 1, blocked_regions[, 3])
)
if (ensembldb::seqlevelsStyle(blocked_regions) == "UCSC") {
ensembldb::seqlevelsStyle(blocked_regions) <- "Ensembl"
}
peak_regions <- stringr::str_split(
string = rownames(matrix_readcount_use),
pattern = "_", simplify = TRUE
) |>
as.data.frame()
peak_regions <- GenomicRanges::GRanges(
peak_regions[, 1],
IRanges::IRanges(
as.numeric(peak_regions[, 2]) + 1,
as.numeric(peak_regions[, 3])
)
)
idy1 <- S4Vectors::queryHits(
GenomicRanges::findOverlaps(peak_regions, blocked_regions)
)
idy2 <- grep("Y|MT|GL|KI", peak_regions)
idy <- unique(c(idy1, idy2))
matrix_readcount_use <- matrix_readcount_use[-idy, ]
dim(matrix_readcount_use)[1] 206024 87
# check memory usage
walk(list(matrix_readcount_use), \(x) {
print(object.size(x), units = "auto", standard = "SI")
})89.8 MB
R session info
devtools::session_info()─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.2.1 (2022-06-23)
os macOS Monterey 12.6
system aarch64, darwin21.6.0
ui unknown
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/Chicago
date 2022-09-23
pandoc 2.19.2 @ /opt/homebrew/bin/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
AnnotationDbi 1.58.0 2022-04-26 [1] Bioconductor
AnnotationFilter 1.20.0 2022-04-26 [1] Bioconductor
assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.2.0)
Biobase 2.56.0 2022-04-26 [1] Bioconductor
BiocFileCache 2.4.0 2022-04-26 [1] Bioconductor
BiocGenerics 0.42.0 2022-04-26 [1] Bioconductor
BiocIO 1.6.0 2022-04-26 [1] Bioconductor
BiocParallel 1.30.3 2022-06-05 [1] Bioconductor
biomaRt 2.52.0 2022-04-26 [1] Bioconductor
Biostrings 2.64.1 2022-08-18 [1] Bioconductor
bit 4.0.4 2020-08-04 [1] CRAN (R 4.2.0)
bit64 4.0.5 2020-08-30 [1] CRAN (R 4.2.0)
bitops 1.0-7 2021-04-24 [1] CRAN (R 4.2.0)
blob 1.2.3 2022-04-10 [1] CRAN (R 4.2.0)
cachem 1.0.6 2021-08-19 [1] CRAN (R 4.2.0)
callr 3.7.2 2022-08-22 [1] CRAN (R 4.2.1)
cli 3.4.0 2022-09-08 [1] CRAN (R 4.2.1)
codetools 0.2-18 2020-11-04 [2] CRAN (R 4.2.1)
colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.2.0)
crayon 1.5.1 2022-03-26 [1] CRAN (R 4.2.0)
curl 4.3.2 2021-06-23 [1] CRAN (R 4.2.0)
data.table 1.14.3 2022-09-12 [1] local
DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.0)
dbplyr 2.2.1 2022-06-27 [1] CRAN (R 4.2.1)
DelayedArray 0.22.0 2022-04-26 [1] Bioconductor
devtools 2.4.4.9000 2022-08-11 [1] Github (r-lib/devtools@c9696a6)
digest 0.6.29 2021-12-01 [1] CRAN (R 4.2.0)
dplyr * 1.0.99.9000 2022-09-21 [1] Github (tidyverse/dplyr@60f7a2d)
ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.0)
ensembldb 2.20.2 2022-06-16 [1] Bioconductor
evaluate 0.16 2022-08-09 [1] CRAN (R 4.2.1)
extrafont * 0.18 2022-04-12 [1] CRAN (R 4.2.0)
extrafontdb 1.0 2012-06-11 [1] CRAN (R 4.2.0)
fansi 1.0.3 2022-03-24 [1] CRAN (R 4.2.0)
fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.0)
filelock 1.0.2 2018-10-05 [1] CRAN (R 4.2.0)
forcats * 0.5.2.9000 2022-08-20 [1] Github (tidyverse/forcats@bd319e0)
fs 1.5.2.9000 2022-08-24 [1] Github (r-lib/fs@238032f)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.1)
GenomeInfoDb 1.32.4 2022-09-06 [1] Bioconductor
GenomeInfoDbData 1.2.8 2022-04-22 [1] Bioconductor
GenomicAlignments 1.32.1 2022-07-24 [1] Bioconductor
GenomicFeatures 1.48.3 2022-05-31 [1] Bioconductor
GenomicRanges 1.48.0 2022-04-26 [1] Bioconductor
ggplot2 * 3.3.6.9000 2022-09-12 [1] Github (tidyverse/ggplot2@a58b48c)
glue 1.6.2.9000 2022-04-22 [1] Github (tidyverse/glue@d47d6c7)
gtable 0.3.1.9000 2022-09-01 [1] Github (r-lib/gtable@c1a7a81)
hms 1.1.2 2022-08-19 [1] CRAN (R 4.2.1)
htmltools 0.5.3 2022-07-18 [1] CRAN (R 4.2.1)
htmlwidgets 1.5.4 2022-08-23 [1] Github (ramnathv/htmlwidgets@400cf1a)
httpuv 1.6.6 2022-09-08 [1] CRAN (R 4.2.1)
httr 1.4.4 2022-08-17 [1] CRAN (R 4.2.1)
IRanges 2.30.1 2022-08-18 [1] Bioconductor
jsonlite 1.8.0 2022-02-22 [1] CRAN (R 4.2.0)
KEGGREST 1.36.3 2022-07-12 [1] Bioconductor
knitr 1.40 2022-08-24 [1] CRAN (R 4.2.1)
later 1.3.0 2021-08-18 [1] CRAN (R 4.2.0)
lattice 0.20-45 2021-09-22 [2] CRAN (R 4.2.1)
lazyeval 0.2.2 2019-03-15 [1] CRAN (R 4.2.0)
lifecycle 1.0.2.9000 2022-09-09 [1] Github (r-lib/lifecycle@a2666fc)
lubridate * 1.8.0.9000 2022-05-24 [1] Github (tidyverse/lubridate@0bb49b2)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0)
Matrix * 1.5-1 2022-09-13 [1] CRAN (R 4.2.1)
MatrixGenerics 1.8.1 2022-06-26 [1] Bioconductor
matrixStats 0.62.0 2022-04-19 [1] CRAN (R 4.2.0)
memoise 2.0.1 2021-11-26 [1] CRAN (R 4.2.0)
mime 0.12 2021-09-28 [1] CRAN (R 4.2.0)
miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.2.0)
munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0)
patchwork * 1.1.2.9000 2022-08-20 [1] Github (thomasp85/patchwork@c14c960)
pillar 1.8.1 2022-08-19 [1] CRAN (R 4.2.1)
pkgbuild 1.3.1 2021-12-20 [1] CRAN (R 4.2.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0)
pkgload 1.3.0 2022-06-27 [1] CRAN (R 4.2.1)
png 0.1-7 2013-12-03 [1] CRAN (R 4.2.0)
prettyunits 1.1.1.9000 2022-04-22 [1] Github (r-lib/prettyunits@8706d89)
processx 3.7.0 2022-07-07 [1] CRAN (R 4.2.1)
profvis 0.3.7 2020-11-02 [1] CRAN (R 4.2.0)
progress 1.2.2 2019-05-16 [1] CRAN (R 4.2.0)
promises 1.2.0.1 2021-02-11 [1] CRAN (R 4.2.0)
ProtGenerics 1.28.0 2022-04-26 [1] Bioconductor
ps 1.7.1 2022-06-18 [1] CRAN (R 4.2.0)
purrr * 0.9000.0.9000 2022-09-21 [1] Github (tidyverse/purrr@194b8ef)
R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.2.1)
R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.2.0)
R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.2.0)
R.utils 2.12.0 2022-06-28 [1] CRAN (R 4.2.1)
R6 2.5.1.9000 2022-08-04 [1] Github (r-lib/R6@87d5e45)
ragg 1.2.2.9000 2022-09-12 [1] Github (r-lib/ragg@904e145)
rappdirs 0.3.3 2021-01-31 [1] CRAN (R 4.2.0)
Rcpp 1.0.9 2022-07-08 [1] CRAN (R 4.2.1)
RCurl 1.98-1.8 2022-07-30 [1] CRAN (R 4.2.1)
readr * 2.1.2.9000 2022-09-20 [1] Github (tidyverse/readr@5cac6ed)
remotes 2.4.2 2022-09-12 [1] Github (r-lib/remotes@bc0949d)
restfulr 0.0.15 2022-06-16 [1] CRAN (R 4.2.0)
reticulate 1.26 2022-08-31 [1] CRAN (R 4.2.1)
rjson 0.2.21 2022-01-09 [1] CRAN (R 4.2.0)
rlang 1.0.5.9000 2022-09-16 [1] Github (r-lib/rlang@5a25ff4)
rmarkdown 2.16.1 2022-08-25 [1] Github (rstudio/rmarkdown@b8a9879)
Rsamtools 2.12.0 2022-04-26 [1] Bioconductor
RSQLite 2.2.17 2022-09-10 [1] CRAN (R 4.2.1)
rtracklayer 1.56.1 2022-06-23 [1] Bioconductor
Rttf2pt1 1.3.10 2022-02-07 [1] CRAN (R 4.2.0)
S4Vectors 0.34.0 2022-04-26 [1] Bioconductor
scales 1.2.1.9000 2022-08-20 [1] Github (r-lib/scales@b3df2fb)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0)
shiny 1.7.2 2022-07-19 [1] CRAN (R 4.2.1)
stringi 1.7.8 2022-07-11 [1] CRAN (R 4.2.1)
stringr * 1.4.1.9000 2022-08-21 [1] Github (tidyverse/stringr@792bc92)
styler * 1.7.0.9002 2022-09-21 [1] Github (r-lib/styler@1f4437b)
SummarizedExperiment 1.26.1 2022-04-29 [1] Bioconductor
systemfonts 1.0.4 2022-02-11 [1] CRAN (R 4.2.0)
textshaping 0.3.6 2021-10-13 [1] CRAN (R 4.2.0)
tibble * 3.1.8.9001 2022-09-20 [1] Github (tidyverse/tibble@b740af1)
tidyr * 1.2.1.9000 2022-09-09 [1] Github (tidyverse/tidyr@653def2)
tidyselect 1.1.2.9000 2022-09-21 [1] Github (r-lib/tidyselect@edd0a3b)
tidyverse * 1.3.2.9000 2022-09-12 [1] Github (tidyverse/tidyverse@3be8283)
tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.2.0)
urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.2.0)
usethis 2.1.6.9000 2022-09-15 [1] Github (r-lib/usethis@7c34252)
utf8 1.2.2 2021-07-24 [1] CRAN (R 4.2.0)
vctrs 0.4.1.9000 2022-09-19 [1] Github (r-lib/vctrs@0a219ba)
withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0)
xfun 0.33 2022-09-12 [1] CRAN (R 4.2.1)
XML 3.99-0.10 2022-06-09 [1] CRAN (R 4.2.0)
xml2 1.3.3 2021-11-30 [1] CRAN (R 4.2.0)
xtable 1.8-4 2019-04-21 [1] CRAN (R 4.2.0)
XVector 0.36.0 2022-04-26 [1] Bioconductor
yaml 2.3.5 2022-02-21 [1] CRAN (R 4.2.0)
zlibbioc 1.42.0 2022-04-26 [1] Bioconductor
[1] /opt/homebrew/lib/R/4.2/site-library
[2] /opt/homebrew/Cellar/r/4.2.1_4/lib/R/library
─ Python configuration ───────────────────────────────────────────────────────
python: /Users/jialei/.pyenv/shims/python
libpython: /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/libpython3.9.dylib
pythonhome: /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10:/Users/jialei/.pyenv/versions/mambaforge-4.10.3-10
version: 3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:00:33) [Clang 13.0.1 ]
numpy: /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy
numpy_version: 1.22.4
numpy: /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy
NOTE: Python version was forced by RETICULATE_PYTHON
──────────────────────────────────────────────────────────────────────────────